This analysis examines simulated income data for Hungary, focusing on the relationships between income and various demographic factors including age, location, occupation, and gender. The dataset is simulated to reflect realistic patterns while maintaining a manageable size for analysis.
# Summary statistics with kable
summary_stats <- summary(data)
kable(summary_stats, caption = "Summary Statistics of the Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| age | city | occupation | gender | income | |
|---|---|---|---|---|---|
| Min. : 0.00 | Length:10000 | Length:10000 | Length:10000 | Min. :-31517 | |
| 1st Qu.:23.00 | Class :character | Class :character | Class :character | 1st Qu.:132102 | |
| Median :37.00 | Mode :character | Mode :character | Mode :character | Median :338604 | |
| Mean :38.01 | NA | NA | NA | Mean :315491 | |
| 3rd Qu.:52.00 | NA | NA | NA | 3rd Qu.:485023 | |
| Max. :94.00 | NA | NA | NA | Max. :850812 |
library(forcats)
# Create 2-year age bins starting from 0
data <- data %>%
mutate(age_group = cut(age,
breaks = seq(0, 100, by = 2),
right = FALSE,
include.lowest = TRUE,
labels = seq(0, 98, by = 2)))
# Prepare data for detailed pyramid
dem_pyramid <- data %>%
group_by(age_group, gender) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(count = ifelse(gender == "Male", -count, count))
# Plot detailed population pyramid
ggplot(dem_pyramid, aes(x = age_group, y = count, fill = gender)) +
geom_bar(stat = "identity", width = 0.8, color = "black") +
scale_y_continuous(labels = abs, expand = expansion(mult = c(0.05, 0.05))) +
scale_fill_manual(values = c("Male" = "#00BFFF", "Female" = "#FF3B3B")) +
coord_flip() +
labs(title = "Population Pyramid of Simulated Hungarian Data",
x = "Age Group",
y = "Count",
fill = "Gender") +
custom_theme +
theme(legend.position = "top",
axis.text.y = element_text(size = 10, face = "bold"),
plot.margin = margin(t = 20, r = 20, b = 20, l = 20))# Income distribution by gender (working age only)
ggplot(data %>% filter(age <= 70 & age >= 18), aes(x = income, fill = gender)) +
geom_density(alpha = 0.6) +
scale_fill_viridis_d() +
labs(title = "Income Distribution by Gender",
subtitle = "(working age only)",
x = "Income (HUF)",
y = "Density") +
custom_theme# Income by occupation with jitter (working age only)
ggplot(data %>% filter(age <= 70 & age >= 18), aes(x = reorder(occupation, income, FUN = median), y = income, color = occupation)) +
geom_boxplot(alpha = 0.7) +
geom_jitter(alpha = 0.1, width = 0.2) +
scale_color_viridis_d() +
coord_flip() +
labs(title = "Income Distribution by Occupation",
subtitle = "(working age only)",
x = "Occupation",
y = "Income (HUF)") +
custom_theme# Income by city with violin plot (working age only)
ggplot(data %>% filter(age <= 70 & age >= 18), aes(x = reorder(city, income, FUN = median), y = income, fill = city)) +
geom_violin(alpha = 0.7) +
geom_boxplot(width = 0.2, alpha = 0.5) +
scale_fill_viridis_d() +
coord_flip() +
labs(title = "Income Distribution by City",
subtitle = "(working age only)",
x = "City",
y = "Income (HUF)") +
custom_theme# Age vs income scatter plot with regression line (working age only)
ggplot(data %>% filter(age <= 70 & age >= 18), aes(x = age, y = income, color = gender)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "loess", se = TRUE) +
scale_color_viridis_d() +
labs(title = "Relationship between Age and Income",
subtitle = "(working age only)",
x = "Age",
y = "Income (HUF)") +
custom_theme# Calculate mean income by category (working age only)
income_by_category <- data %>%
filter(age <= 70 & age >= 18) %>%
group_by(occupation, city, gender) %>%
summarise(
mean_income = mean(income),
count = n(),
.groups = 'drop'
) %>%
arrange(desc(mean_income))
# Create a heatmap of mean income by occupation and city
ggplot(income_by_category, aes(x = city, y = occupation, fill = mean_income)) +
geom_tile() +
scale_fill_viridis(name = "Mean Income (HUF)") +
facet_wrap(~gender) +
labs(title = "Mean Income by Occupation, City, and Gender",
subtitle = "(working age only)",
x = "City",
y = "Occupation") +
custom_theme +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(face = "bold"))# Create a summary table of the top 10 highest earning combinations (working age only)
top_earners <- income_by_category %>%
arrange(desc(mean_income)) %>%
head(10)
kable(top_earners,
caption = "Top 10 Highest Earning Combinations",
digits = 0) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| occupation | city | gender | mean_income | count |
|---|---|---|---|---|
| Software Developer | Budapest | Male | 568530 | 112 |
| Software Developer | Debrecen | Male | 542354 | 46 |
| Software Developer | Budapest | Female | 535991 | 108 |
| Doctor | Budapest | Female | 522413 | 70 |
| Software Developer | Debrecen | Female | 517588 | 48 |
| Manager | Budapest | Female | 516757 | 100 |
| Doctor | Budapest | Male | 516183 | 58 |
| Engineer | Budapest | Male | 512031 | 132 |
| Software Developer | Szeged | Male | 509836 | 25 |
| Software Developer | Eger | Male | 499559 | 11 |
# Test if there's a significant difference in income between genders
t_test_result <- t.test(income ~ gender, data = data)
print(t_test_result)##
## Welch Two Sample t-test
##
## data: income by gender
## t = -2.2421, df = 9988.2, p-value = 0.02497
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -17644.966 -1183.765
## sample estimates:
## mean in group Female mean in group Male
## 310770.4 320184.7
# Test if there are significant differences in income between cities
city_anova <- aov(income ~ city, data = data)
print(summary(city_anova))## Df Sum Sq Mean Sq F value Pr(>F)
## city 7 1.186e+13 1.694e+12 39.45 <2e-16 ***
## Residuals 9992 4.291e+14 4.295e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test if there are significant differences in income between occupations
occupation_anova <- aov(income ~ occupation, data = data)
print(summary(occupation_anova))## Df Sum Sq Mean Sq F value Pr(>F)
## occupation 9 1.242e+13 1.38e+12 32.18 <2e-16 ***
## Residuals 9990 4.286e+14 4.29e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test if income distributions differ between genders
male_income <- data$income[data$gender == "Male"]
female_income <- data$income[data$gender == "Female"]
ks_test <- ks.test(male_income, female_income)
print(ks_test)##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: male_income and female_income
## D = 0.033246, p-value = 0.007961
## alternative hypothesis: two-sided
# Test if income distributions differ between cities
kruskal_test <- kruskal.test(income ~ city, data = data)
print(kruskal_test)##
## Kruskal-Wallis rank sum test
##
## data: income by city
## Kruskal-Wallis chi-squared = 276.84, df = 7, p-value < 2.2e-16
# Convert categorical variables to factors
data_working_age <- data %>% filter(age <= 70 & age >= 18)
data_working_age$city <- as.factor(data_working_age$city)
data_working_age$occupation <- as.factor(data_working_age$occupation)
data_working_age$gender <- as.factor(data_working_age$gender)
# Fit the model
model1 <- lm(income ~ age + I(age^2) + city + occupation + gender, data = data_working_age)
# Create a beautiful model summary table
model_summary <- summary(model1)
kable(tidy(model_summary), caption = "Multiple Linear Regression Results (Working Age Only)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | -103304.3227 | 5294.911397 | -19.51011 | 0 |
| age | 17996.5194 | 250.036068 | 71.97569 | 0 |
| I(age^2) | -100.0189 | 2.895668 | -34.54088 | 0 |
| cityDebrecen | -50555.7719 | 1708.534326 | -29.59014 | 0 |
| cityEger | -105861.9735 | 2441.761341 | -43.35476 | 0 |
| cityGyőr | -100901.8196 | 2201.479335 | -45.83364 | 0 |
| cityMiskolc | -98022.3650 | 1990.245618 | -49.25139 | 0 |
| cityPécs | -104132.3473 | 2195.486122 | -47.43020 | 0 |
| citySzeged | -54481.4626 | 1911.615619 | -28.50022 | 0 |
| citySzombathely | -97368.4232 | 2474.808146 | -39.34383 | 0 |
| occupationChef | -34050.3543 | 2937.191518 | -11.59283 | 0 |
| occupationDoctor | 63015.0930 | 3036.507567 | 20.75249 | 0 |
| occupationDriver | -45548.0684 | 2979.646552 | -15.28640 | 0 |
| occupationEngineer | 27022.3777 | 2381.063430 | 11.34887 | 0 |
| occupationManager | 44518.4166 | 2656.314580 | 16.75947 | 0 |
| occupationNurse | -24868.5619 | 2271.901452 | -10.94614 | 0 |
| occupationSales Representative | -55848.7747 | 2030.431916 | -27.50586 | 0 |
| occupationSoftware Developer | 92726.8736 | 2575.317522 | 36.00600 | 0 |
| occupationTeacher | -15803.4550 | 2157.111990 | -7.32621 | 0 |
| genderMale | 20841.9232 | 1119.996419 | 18.60892 | 0 |
# Enhanced model diagnostics
par(mfrow = c(2,2))
plot(model1, col = income_palette[1], pch = 19, cex = 0.7)# Fit polynomial regression
model2 <- lm(income ~ poly(age, 3), data = data_working_age)
# Create a static plot
ggplot(data_working_age, aes(x = age, y = income)) +
geom_point(alpha = 0.1, color = income_palette[1]) +
geom_smooth(method = "lm", formula = y ~ poly(x, 3),
color = income_palette[5], fill = income_palette[5], alpha = 0.2) +
labs(title = "Polynomial Regression: Age vs Income",
subtitle = "Cubic polynomial fit with confidence interval (Working Age Only)",
x = "Age",
y = "Income (HUF)") +
custom_theme# Model comparison
model_comparison <- data.frame(
Model = c("Multiple Linear", "Polynomial"),
R_squared = c(summary(model1)$r.squared, summary(model2)$r.squared),
Adj_R_squared = c(summary(model1)$adj.r.squared, summary(model2)$adj.r.squared)
)
kable(model_comparison, caption = "Model Comparison (Working Age Only)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| Model | R_squared | Adj_R_squared |
|---|---|---|
| Multiple Linear | 0.9000774 | 0.8998350 |
| Polynomial | 0.7386924 | 0.7385925 |
# Create a sample prediction
new_data <- data.frame(
age = 35,
city = "Budapest",
occupation = "Software Developer",
gender = "Male"
)
# Predict income
prediction <- predict(model1, newdata = new_data, interval = "prediction")
print(prediction)## fit lwr upr
## 1 517619.5 420425.2 614813.8
The analysis of the simulated Hungarian income data reveals several interesting patterns:
The analysis demonstrates the complex interplay between various factors affecting income levels in Hungary. The findings suggest that both demographic factors (age, gender) and professional factors (occupation, location) play important roles in determining income levels.